Heloderma suspectum, the Gila Monster
obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
redo <- FALSE
if (!file.exists(obs_geo) | redo){
# get species occurrence data from GBIF with coordinates
(res <- spocc::occ(
query = 'Heloderma suspectum',
from = 'gbif', has_coords = T,
limit = 10000))
# extract data frame from result
df <- res$gbif$data[[1]]
readr::write_csv(df, obs_csv)
# convert to points of observation from lon/lat columns in data frame
obs <- df %>%
sf::st_as_sf(
coords = c("longitude", "latitude"),
crs = st_crs(4326)) %>%
select(prov, key) %>% # save space (joinable from obs_csv)
subset(!key %in% c(861689291, 1050659944, 1145541957, 686703568))
sf::write_sf(obs, obs_geo, delete_dsn=T)
}
obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 1258
mapview::mapview(obs, map.types = "OpenTopoMap")
Several observations were rerecorded with a lat/long of (0,0) as indicated by the dots off the coast of Africa (visible if the above code is run without line 58). Additionally, an investigation of Gila Monster habitat on Google shows that the points in the Midwest and California coast are far out of habitable range. Sources disagree on whether or not the true range extends into southern Mexico, so for the purposes of this analysis those observations will remain in the dataset.
After removing these four bad observations, we are left with 1258 observations of Heloderma suspectum from the GBIF database.
I used the same default layers presented in the lab template: WC_alt, WC_bio1, WC_bio2, ER_tri, ER_topoWet.
I concluded that these would be appropriate metrics because1:
Are all different ways of measuring climactic variables that are associated with deserts, the Gila Monster’s native habitat. Including all three should allow us to identify which features of the desert might be the most important in identifying suitable habitat.
Both measure adjacent metrics that could help narrow down the range: for example, as slow-moving reptiles they may prefer a sweet spot of terrain roughness that isn’t too hard for them to navigate but also provides enough nooks and crannies to hide in. Altitude is somewhat of an intermediary between terrain and temperature variables, with lower elevations often flatter in the Southwest and also warmer.
# choosing terrestrial
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")
# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", "WC_bio1", "WC_bio2", "ER_tri", "ER_topoWet")
# get layers
env_stack <- load_layers(env_layers_vec)
# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = T) # makes the html too big for Github
plot(env_stack, nc=2)
obs_hull_geo <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
if (!file.exists(obs_hull_geo) | redo){
# make convex hull around points of observation
obs_hull <- sf::st_convex_hull(st_union(obs))
# save obs hull
write_sf(obs_hull, obs_hull_geo)
}
obs_hull <- read_sf(obs_hull_geo)
# show points on map
mapview(
list(obs, obs_hull), map.types = "OpenTopoMap")
if (!file.exists(env_stack_grd) | redo){
obs_hull_sp <- sf::as_Spatial(obs_hull)
env_stack <- raster::mask(env_stack, obs_hull_sp) %>%
raster::crop(extent(obs_hull_sp))
writeRaster(env_stack, env_stack_grd, overwrite=T)
}
env_stack <- stack(env_stack_grd)
# show map
# mapview(obs) +
# mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)
absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
if (!file.exists(absence_geo) | redo){
# get raster count of observations
r_obs <- rasterize(
sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
# show map
# mapview(obs) +
# mapview(r_obs)
# create mask for
r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse=T)
# generate random points inside mask
absence <- dismo::randomPoints(r_mask, nrow(obs)) %>%
as_tibble() %>%
st_as_sf(coords = c("x", "y"), crs = 4326)
write_sf(absence, absence_geo, delete_dsn=T)
}
absence <- read_sf(absence_geo)
# show map of presence (ie obs) and absence
mapview(obs, col.regions = "green") +
mapview(absence, col.regions = "gray")
if (!file.exists(pts_env_csv) | redo){
# combine presence and absence into single set of labeled points
pts <- rbind(
obs %>%
mutate(
present = 1) %>%
select(present, key),
absence %>%
mutate(
present = 0,
key = NA)) %>%
mutate(
ID = 1:n()) %>%
relocate(ID)
write_sf(pts, pts_geo, delete_dsn=T)
# extract raster values for points
pts_env <- raster::extract(env_stack, as_Spatial(pts), df=TRUE) %>%
tibble() %>%
# join present and geometry columns to raster value results for points
left_join(
pts %>%
select(ID, present),
by = "ID") %>%
relocate(present, .after = ID) %>%
# extract lon, lat as single columns
mutate(
#present = factor(present),
lon = st_coordinates(geometry)[,1],
lat = st_coordinates(geometry)[,2]) %>%
select(-geometry)
write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)
pts_env %>%
select(-ID) %>%
mutate(
present = factor(present)) %>%
pivot_longer(-present) %>%
ggplot() +
geom_density(aes(x = value, fill = present)) +
scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
theme_bw() +
facet_wrap(~name, scales = "free") +
theme(
legend.position = c(1, 0),
legend.justification = c(1, 0))
GGally::ggpairs(
select(pts_env, -ID),
aes(color = factor(present)))
d <- pts_env %>%
select(-ID) %>% # remove terms we don't want to model
tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 2508
# fit a linear model
mdl <- lm(present ~ ., data = d)
summary(mdl)
##
## Call:
## lm(formula = present ~ ., data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1038 -0.3699 0.1417 0.3169 0.9387
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.79779362 0.86250731 2.084 0.0372 *
## WC_alt 0.00007040 0.00005537 1.272 0.2037
## WC_bio1 0.05888077 0.00935076 6.297 0.000000000358 ***
## WC_bio2 -0.04023579 0.00642192 -6.265 0.000000000437 ***
## ER_tri -0.00078104 0.00051445 -1.518 0.1291
## ER_topoWet -0.03075911 0.01279483 -2.404 0.0163 *
## lon 0.04101600 0.00758717 5.406 0.000000070564 ***
## lat 0.10156525 0.00666823 15.231 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4077 on 2500 degrees of freedom
## Multiple R-squared: 0.3373, Adjusted R-squared: 0.3354
## F-statistic: 181.8 on 7 and 2500 DF, p-value: < 0.00000000000000022
y_predict <- predict(mdl, d, type="response")
y_true <- d$present
range(y_predict)
## [1] -0.4040449 1.1093685
# fit a generalized linear model with a binomial logit link function
mdl_logit <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl_logit)
##
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"),
## data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.70006 -0.77577 -0.05915 0.81018 2.50613
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 12.9185927 6.0093620 2.150 0.0316 *
## WC_alt -0.0001416 0.0003924 -0.361 0.7182
## WC_bio1 0.3326938 0.0604448 5.504 0.000000037108 ***
## WC_bio2 -0.1743633 0.0396090 -4.402 0.000010719951 ***
## ER_tri -0.0019532 0.0033282 -0.587 0.5573
## ER_topoWet -0.2045338 0.0804176 -2.543 0.0110 *
## lon 0.3160960 0.0516575 6.119 0.000000000941 ***
## lat 0.6855362 0.0464695 14.752 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3476.8 on 2507 degrees of freedom
## Residual deviance: 2412.1 on 2500 degrees of freedom
## AIC: 2428.1
##
## Number of Fisher Scoring iterations: 5
y_predict <- predict(mdl_logit, d, type="response")
range(y_predict)
## [1] 0.001602979 0.975640706
# show term plots
termplot(mdl_logit, partial.resid = TRUE, se = TRUE, main = F, ylim="free")
mdl_add <- mgcv::gam(
formula = present ~ s(WC_alt) + s(WC_bio1) +
s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat),
family = binomial, data = d)
summary(mdl_add)
##
## Family: binomial
## Link function: logit
##
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(ER_topoWet) +
## s(lon) + s(lat)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.2921 0.2985 -4.329 0.000015 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(WC_alt) 5.769 6.853 29.573 0.000132 ***
## s(WC_bio1) 7.112 8.133 45.708 < 0.0000000000000002 ***
## s(WC_bio2) 1.513 1.884 15.518 0.001884 **
## s(ER_tri) 2.709 3.500 9.752 0.026552 *
## s(ER_topoWet) 8.138 8.761 33.556 0.0000792 ***
## s(lon) 6.492 7.595 157.875 < 0.0000000000000002 ***
## s(lat) 8.478 8.847 180.041 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.645 Deviance explained = 59.4%
## UBRE = -0.40395 Scale est. = 1 n = 2508
# show term plots
plot(mdl_add, scale=0)
Most environmental variables hover around zero, jump around, and/or have large error margins. Interestingly, WC_bio2 (mean diurnal temperature range) is the most consistent as both a predictor of presence and absence: at low levels it is predictive of presence, but at high levels is predictive of absence. Longitude is largely a negative predictor, but with exceptionally low confidence at higher values.
mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds")
# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>%
sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class
# fit a maximum entropy model
if (!file.exists(mdl_maxent_rds)){
mdl_mx <- maxent(env_stack, obs_sp)
readr::write_rds(mdl_mx, mdl_maxent_rds)
}
mdl_mx <- read_rds(mdl_maxent_rds)
# plot variable contributions per predictor
plot(mdl_mx)
# plot term plots
response(mdl_mx)
# predict
y_predict <- predict(env_stack, mdl_mx) #, ext=ext, progress='')
plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')
Low values of WC_alt is a strong positive predictor, as is low WC_bio1. Likewise, high values of both those variables are strong negative predictors. Interestingly, Maxent identifies the opposite effect of WC_bio2 as the GAM did: here, high WC_bio2 is a positive predictor.
# read data
pts_env <- read_csv(pts_env_csv)
d <- pts_env %>%
select(-ID) %>% # not used as a predictor x
mutate(
present = factor(present)) %>% # categorical response
na.omit() # drop rows with NA
# create training set with 80% of full data
d_split <- rsample::initial_split(d, prop = 0.8, strata = "present")
d_train <- rsample::training(d_split)
# show number of rows present is 0 vs 1 in each df
table(d$present)
##
## 0 1
## 1256 1252
table(d_train$present)
##
## 0 1
## 1004 1001
# simplest tree with maxdepth = 1
mdl1 <- rpart(
present ~ ., data = d_train,
control = list(
cp = 0, minbucket = 5, maxdepth = 1))
mdl1
## n= 2005
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 2005 1001 0 (0.50074813 0.49925187)
## 2) lon>=-108.7087 645 7 0 (0.98914729 0.01085271) *
## 3) lon< -108.7087 1360 366 1 (0.26911765 0.73088235) *
# plot tree (depth=1)
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl1)
# decision tree with default settings
mdl <- rpart(present ~ ., data = d_train)
mdl
## n= 2005
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 2005 1001 0 (0.50074813 0.49925187)
## 2) lon>=-108.7087 645 7 0 (0.98914729 0.01085271) *
## 3) lon< -108.7087 1360 366 1 (0.26911765 0.73088235)
## 6) lon< -112.1894 323 145 0 (0.55108359 0.44891641)
## 12) lon>=-112.3996 21 0 0 (1.00000000 0.00000000) *
## 13) lon< -112.3996 302 145 0 (0.51986755 0.48013245)
## 26) ER_topoWet< 12.9 270 118 0 (0.56296296 0.43703704)
## 52) lat< 36.9833 247 98 0 (0.60323887 0.39676113)
## 104) lat>=32.3698 204 70 0 (0.65686275 0.34313725) *
## 105) lat< 32.3698 43 15 1 (0.34883721 0.65116279) *
## 53) lat>=36.9833 23 3 1 (0.13043478 0.86956522) *
## 27) ER_topoWet>=12.9 32 5 1 (0.15625000 0.84375000) *
## 7) lon>=-112.1894 1037 188 1 (0.18129219 0.81870781)
## 14) lon>=-110.5494 274 114 1 (0.41605839 0.58394161)
## 28) lat< 31.20907 96 27 0 (0.71875000 0.28125000) *
## 29) lat>=31.20907 178 45 1 (0.25280899 0.74719101)
## 58) lat>=33.20688 25 2 0 (0.92000000 0.08000000) *
## 59) lat< 33.20688 153 22 1 (0.14379085 0.85620915) *
## 15) lon< -110.5494 763 74 1 (0.09698558 0.90301442)
## 30) lat>=34.41848 16 0 0 (1.00000000 0.00000000) *
## 31) lat< 34.41848 747 58 1 (0.07764391 0.92235609) *
rpart.plot(mdl)
# plot complexity parameter
plotcp(mdl)
# rpart cross validation results
mdl$cptable
## CP nsplit rel error xerror xstd
## 1 0.62737263 0 1.0000000 1.0879121 0.02228283
## 2 0.03296703 1 0.3726274 0.3736264 0.01742495
## 3 0.02097902 2 0.3396603 0.3526474 0.01703731
## 4 0.01598402 5 0.2767233 0.2897103 0.01573402
## 5 0.01098901 6 0.2607393 0.2447552 0.01465035
## 6 0.01000000 10 0.2087912 0.2287712 0.01422813
# caret cross validation results
mdl_caret <- train(
present ~ .,
data = d_train,
method = "rpart",
trControl = trainControl(method = "cv", number = 10),
tuneLength = 20)
ggplot(mdl_caret)
vip(mdl_caret, num_features = 40, bar = FALSE)
According to the R Documantation for plotcp(), “A good choice of cp for pruning is often the leftmost value for which the mean lies below the horizontal line.” With this in mind, a tree with 11 branches is recommended.
The variable importance chart above lists lon, lat, and WC_alt as the most important variables in determining presence/absence.
However, the nodes of the tree itself show a slightly different story: the entire tree is made up on splits of latitude and longitude, with the exception of a single node based on ER_topoWet. So while the importance of lat/long is clear, no decisions in the tree are made based on WC_alt, and ER_topoWet does show up in the tree despite being ranked as very low importance.
# number of features
n_features <- length(setdiff(names(d_train), "present"))
# fit a default random forest model
mdl_rf <- ranger(present ~ ., data = d_train)
# get out of the box RMSE
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.2928917
# re-run model with impurity-based variable importance
mdl_impurity <- ranger(
present ~ ., data = d_train,
importance = "impurity")
# re-run model with permutation-based variable importance
mdl_permutation <- ranger(
present ~ ., data = d_train,
importance = "permutation")
p1 <- vip::vip(mdl_impurity, bar = FALSE)
p2 <- vip::vip(mdl_permutation, bar = FALSE)
gridExtra::grid.arrange(p1, p2, nrow = 1)
Because Random Forest uses many trees combined to come up with the best overall solution, it may not line up with the single decision tree created above. This can be seen slightly differently in each of the variable importance charts: the permutation-based importance is in mostly the same order as the individual decision tree (only lat/lon are switched), while the impurity-based importance varies with ER_topoWet gaining importance over WC_bio2.
# read points of observation: presence (1) and absence (0)
pts <- read_sf(pts_geo)
# create training set with 80% of full data
pts_split <- rsample::initial_split(
pts, prop = 0.8, strata = "present")
pts_train <- rsample::training(pts_split)
pts_test <- rsample::testing(pts_split)
pts_train_p <- pts_train %>%
filter(present == 1) %>%
as_Spatial()
pts_train_a <- pts_train %>%
filter(present == 0) %>%
as_Spatial()
# show pairs plot before multicollinearity reduction with vifcor()
pairs(env_stack)
# calculate variance inflation factor per predictor, a metric of multicollinearity between variables
vif(env_stack)
## Variables VIF
## 1 WC_alt 3.018233
## 2 WC_bio1 2.880445
## 3 WC_bio2 1.319170
## 4 ER_tri 4.814898
## 5 ER_topoWet 5.043996
# stepwise reduce predictors, based on a max correlation of 0.7 (max 1)
v <- vifcor(env_stack, th=0.7)
v
## 2 variables from the 5 input variables have collinearity problem:
##
## ER_topoWet WC_alt
##
## After excluding the collinear variables, the linear correlation coefficients ranges between:
## min correlation ( ER_tri ~ WC_bio1 ): -0.01596287
## max correlation ( WC_bio2 ~ WC_bio1 ): -0.3281011
##
## ---------- VIFs of the remained variables --------
## Variables VIF
## 1 WC_bio1 1.140378
## 2 WC_bio2 1.262452
## 3 ER_tri 1.126835
# reduce enviromental raster stack by
env_stack_v <- usdm::exclude(env_stack, v)
# show pairs plot after multicollinearity reduction with vifcor()
pairs(env_stack_v)
mdl_maxv_rds <- file.path(dir_data, "mdl_maxv.rds")
# fit a maximum entropy model
if (!file.exists(mdl_maxv_rds)){
mdl_maxv <- maxent(env_stack_v, sf::as_Spatial(pts_train))
readr::write_rds(mdl_maxv, mdl_maxv_rds)
}
# mdl_maxv <- read_rds(mdl_maxv.rds)
mdl_maxv <- maxent(env_stack_v, sf::as_Spatial(pts_train))
## This is MaxEnt version 3.4.3
# plot variable contributions per predictor
plot(mdl_maxv)
# plot term plots
dismo::response(mdl_maxv)
# note: JVM was messed up so had to run `rJava::.jpackage("dismo")` to be able to run it
# predict
y_maxv <- predict(env_stack, mdl_maxv) #, ext=ext, progress='')
plot(y_maxv, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')
#### Variables removed due to multicolinearity; rank of remaining variables
ER_topoWet and WC_alt were removed due to multicolinearity. Of the remaining 3 variables, WC_bio2 is the most important, followed by WC_bio1, and finally ER_tri is the least important.
pts_test_p <- pts_test %>%
filter(present == 1) %>%
as_Spatial()
pts_test_a <- pts_test %>%
filter(present == 0) %>%
as_Spatial()
# y_maxv <- predict(mdl_maxv, env_stack)
# plot(y_maxv)
e <- dismo::evaluate(
p = pts_test_p,
a = pts_test_a,
model = mdl_maxv,
x = env_stack)
e
## class : ModelEvaluation
## n presences : 250
## n absences : 251
## AUC : 0.6708526
## cor : 0.3210313
## max TPR+TNR at : 0.6370805
plot(e, 'ROC')
thr <- threshold(e)[['spec_sens']]
print(paste(thr, "is the ROC threshold value"))
## [1] "0.63708045 is the ROC threshold value"
p_true <- na.omit(raster::extract(y_maxv, pts_test_p) >= thr)
a_true <- na.omit(raster::extract(y_maxv, pts_test_a) < thr)
# (t)rue/(f)alse (p)ositive/(n)egative rates
tpr <- sum(p_true)/length(p_true)
fnr <- sum(!p_true)/length(p_true)
fpr <- sum(!a_true)/length(a_true)
tnr <- sum(a_true)/length(a_true)
matrix(
c(tpr, fnr,
fpr, tnr),
nrow=2, dimnames = list(
c("present_obs", "absent_obs"),
c("present_pred", "absent_pred")))
## present_pred absent_pred
## present_obs 0.744 0.4541833
## absent_obs 0.256 0.5458167
# add point to ROC plot
points(fpr, tpr, pch=23, bg="blue")
plot(y_maxv > thr, main='Maxent binary habitat')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')
Rationale are based off my own intuition and general familiarity with the species, not an actual literature search.↩︎